Load Auto data
# Load the Auto data set (shipped with the course package) into the workspace
data("Auto")
a) Which of the predictors are quantitative and which are qualitative?
b) What is the range of each quantitative predictor?
#summary function is quick way to see the range (min and max) as below.
# Per-column summary: min/quartiles/mean/max for numeric columns; the 'name'
# column is tabulated as counts (see output below), so it is qualitative
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
# alternatively to just get the min and max in a table could do something like:
# This creates a dataframe called r_min from the Auto dataframe, then uses the summarise function. Without the
# across() function summarise would apply the 'min' function to the whole dataframe.
# By using the across() function it tells summarise to apply the function to each
# column. Next repeat the process creating r_max. Then create Auto_ranges dataframe
# putting together the two(r_min and r_max) using bind_rows(). Finally add a column
# labeled 'range' using mutate() and filling in the values with a simple vector c().
{
  # Column-wise minima and maxima of the quantitative predictors (mpg:year)
  auto_min <- Auto %>% summarise(across(mpg:year, min))
  auto_max <- Auto %>% summarise(across(mpg:year, max))
  # Stack the two one-row frames and label each row
  Auto_ranges <- bind_rows(auto_min, auto_max) %>%
    mutate(range = c("minimum", "maximum"))
  print(Auto_ranges)
}
## mpg cylinders displacement horsepower weight acceleration year range
## 1 9.0 3 68 46 1613 8.0 70 minimum
## 2 46.6 8 455 230 5140 24.8 82 maximum
c) What is the mean and standard deviation of each quantitative predictor?
#There is a conflict when using both dplyr and MASS packages.
#specify dplyr::select when both are loaded to resolve the conflict.
# take the Auto dataframe and select only the columns from mpg through year
# then pass this to the tbl_summary function. The 'type' argument specifies
# the variable type (ex continuous, categorical, etc). The function will default
# to an appropriate type however here we wish to override the default regarding
# our data for cylinders to make sure it is classified as continuous. The next
# argument in the tbl_summary function is 'statistic' and we ask it to find the
# mean and standard deviations for all continuous variables.
# Summarise mean and SD of each quantitative column mpg through year.
# 'type' forces cylinders to be treated as continuous; 'statistic' formats
# every continuous variable as "mean,sd".
Auto %>%
  dplyr::select(mpg:year) %>%
  tbl_summary(
    type = list(cylinders ~ "continuous"),
    statistic = list(all_continuous() ~ "{mean},{sd}")
  )
| Characteristic | N = 392¹ |
|---|---|
| mpg | 23,8 |
| cylinders | 5,2 |
| displacement | 194,105 |
| horsepower | 104,38 |
| weight | 2,978,849 |
| acceleration | 15.54,2.76 |
| year | 76,4 |
| ¹ Mean, SD | |
d) Now remove the 10th through 85th observations. What is the range, mean, and standard deviation of each predictor in the subset of the data that remains?
# Select() only the columns of interest. Slice() only the rows of interest. Use
# tbl_summary function as above adding the min and max functions to be calculated
# as well.
# Drop observations 10-85 (keep rows 1-9 and 86-392), then report the
# range (min/max), mean, and SD of each remaining quantitative predictor.
Auto %>%
  dplyr::select(mpg:year) %>%
  slice(c(1:9, 86:392)) %>%
  tbl_summary(
    type = list(cylinders ~ "continuous"),
    statistic = list(all_continuous() ~ "{min}, {max},{mean},{sd}")
  )
| Characteristic | N = 316¹ |
|---|---|
| mpg | 11, 47,24,8 |
| cylinders | 3, 8,5,2 |
| displacement | 68, 455,187,100 |
| horsepower | 46, 230,101,36 |
| weight | 1,649, 4,997,2,936,811 |
| acceleration | 8.50, 24.80,15.73,2.69 |
| year | 70, 82,77,3 |
| ¹ Min, Max, Mean, SD | |
(e) Using the full data set, investigate the predictors graphically, using scatterplots or other tools of your choice. Create some plots highlighting the relationships among the predictors. Comment on your findings.
# mpg vs weight/origin/cylinder/acceleration/year
# acceleration vs cylinder/displacement/weight/origin
# Exploratory plots of the Auto predictors.
# mpg over time, colored by cylinder count, one panel per country of origin
ggplot(data = Auto, aes(x = year, y = mpg, color = cylinders)) +
  geom_point() +
  facet_wrap(vars(origin)) +
  labs(title = "mpg by year", subtitle = "comparing country of origin")

# acceleration over time, colored by weight, faceted on cylinders x origin
ggplot(data = Auto, aes(x = year, y = acceleration, color = weight)) +
  geom_point() +
  facet_grid(cols = vars(cylinders), rows = vars(origin)) +
  labs(title = "acceleration by year",
       subtitle = "comparing cylinders and country of origin")

# acceleration vs weight, colored by engine displacement
ggplot(data = Auto, aes(x = weight, y = acceleration, color = displacement)) +
  geom_point() +
  labs(title = "acceleration by weight")

# acceleration vs horsepower, colored by mpg
ggplot(data = Auto, aes(x = horsepower, y = acceleration, color = mpg)) +
  geom_point() +
  labs(title = "acceleration by horsepower")

# horsepower vs weight, colored by displacement, one panel per cylinder count
ggplot(data = Auto, aes(x = weight, y = horsepower, color = displacement)) +
  geom_point() +
  facet_wrap(vars(cylinders)) +
  labs(title = "horsepower by weight", subtitle = "comparing by cylinders")
(f) Suppose that we wish to predict gas mileage (mpg) on the basis of the other variables. Do your plots suggest that any of the other variables might be useful in predicting mpg? Justify your answer.
# mpg vs acceleration, colored by model year, one panel per country of origin
ggplot(data = Auto, aes(x = acceleration, y = mpg, color = year)) +
  geom_point() +
  facet_wrap(vars(origin)) +
  labs(title = "mpg by acceleration", subtitle = "comparing country of origin")
Just FYI:
# Create a 'make' variable by removing first part of name
# add a column named 'make' by applying the function gsub to the current name
# column observations. gsub() looks for a pattern of a character string and
# replaces with something else you specify. The first argument is the pattern
# here specified as a space followed by anything (.* meaning wildcard). The second
# argument replaces it in this case with nothing (the empty quotes ""), the
# last argument of the function is the data or x (Auto$name column).
# Derive a 'make' column: delete everything from the first space onward in
# the car's name, leaving only the manufacturer token.
Auto$make <- gsub(" .*", "", Auto$name)
# Tabulate how many cars each make has, sorted ascending
sort(table(Auto$make))
##
## capri chevroelt hi mercedes nissan
## 1 1 1 1 1
## toyouta triumph vokswagen bmw cadillac
## 1 1 1 2 2
## maxda mercedes-benz chevy renault opel
## 2 2 3 3 4
## saab subaru chrysler volvo vw
## 4 4 6 6 6
## audi fiat peugeot mazda oldsmobile
## 7 8 8 10 10
## mercury honda volkswagen pontiac buick
## 11 13 15 16 17
## datsun toyota amc dodge plymouth
## 23 25 27 28 31
## chevrolet ford
## 43 48
# Create a column 'Ford01' from the table above and fill in the Ford01 col
# with either a 1 or 0 using the ifelse().
# Binary indicator: 1 when the make is ford, 0 otherwise
Auto$Ford01 <- ifelse(Auto$make == "ford", 1, 0)
# Scatter of the indicator against mpg with a smoothed trend
ggplot(Auto, aes(mpg, Ford01)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Just FYI: how to use LASSO with a real example — use lasso to predict Ford.
# Numeric predictor matrix for the lasso fit below (cv.glmnet requires a matrix)
Xmatrix <- as.matrix(
  subset(Auto,
         select = c("mpg", "cylinders", "horsepower",
                    "weight", "acceleration", "year"))
)
# The function cv.glmnet (cross-validated generalized linear model with an
# elastic net — think fishing net) fits a lasso-style model. The first
# argument is x (the predictor matrix built above); the second is y (the
# response, here Ford01). The optional 'family' argument declares the
# response as binomial, and 'type.measure' specifies which loss equation to
# use for cross-validation. See the cv.glmnet help file for details: there
# are five options, though not all are available for every model; "auc" is
# for two-class logistic regression only and gives the area under the
# receiver operating characteristic curve.
# Cross-validated lasso: predict the Ford indicator from the numeric
# predictors, scoring folds by area under the ROC curve ("auc")
lassoFit <- cv.glmnet(
  x = Xmatrix,
  y = Auto$Ford01,
  family = "binomial",
  type.measure = "auc"
)
plot(lassoFit)
# Ordinary (unpenalized) logistic fit on the same design, for comparison
glm(Auto$Ford01 ~ Xmatrix, family = "binomial") %>% summary()
##
## Call:
## glm(formula = Auto$Ford01 ~ Xmatrix, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2492 -0.5832 -0.3845 -0.2487 2.6680
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.4659295 4.5988133 0.101 0.91930
## Xmatrixmpg -0.1835521 0.0614842 -2.985 0.00283 **
## Xmatrixcylinders 0.0561016 0.2260124 0.248 0.80396
## Xmatrixhorsepower -0.0415583 0.0147256 -2.822 0.00477 **
## Xmatrixweight 0.0006789 0.0006275 1.082 0.27930
## Xmatrixacceleration -0.1835041 0.1118192 -1.641 0.10078
## Xmatrixyear 0.0842176 0.0628285 1.340 0.18010
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 291.47 on 391 degrees of freedom
## Residual deviance: 261.13 on 385 degrees of freedom
## AIC: 275.13
##
## Number of Fisher Scoring iterations: 6
Read about the data set (`?Boston`). How many rows are in this data set? How many columns? What do the rows and columns represent?
| Column Name | Description |
|---|---|
| crim | per capita crime rate by town. |
| zn | proportion of residential land zoned for lots over 25,000 sq.ft. |
| indus | proportion of non-retail business acres per town. |
| chas | Charles River dummy variable (= 1 if tract bounds river; 0 otherwise). |
| nox | nitrogen oxides concentration (parts per 10 million). |
| rm | average number of rooms per dwelling. |
| age | proportion of owner-occupied units built prior to 1940. |
| dis | weighted mean of distances to five Boston employment centres. |
| rad | index of accessibility to radial highways. |
| tax | full-value property-tax rate per $10,000. |
| ptratio | pupil-teacher ratio by town. |
| lstat | lower status of the population (percent). |
| medv | median value of owner-occupied homes in $1000s. |
# Load the Boston housing data and draw the pairwise scatterplot matrix
data("Boston")
plot(Boston)
The plots above suggest that crime is concentrated in certain suburbs with lower median home values, a higher percentage of lower-status population, more buildings built before 1940, closer proximity to the five Boston employment centers, and some relationship with industry and NOx concentration.
# Scatterplots of per-capita crime rate against the predictors it appeared
# related to in the pairs plot above.
ggplot(data = Boston, aes(x = medv, y = crim)) +
  geom_point() +
  labs(title = "Crime Rate and Median Value")
ggplot(data = Boston, aes(x = lstat, y = crim)) +
  geom_point() +
  labs(title = "Crime Rate and % Lower Social-Economic Status")
ggplot(data = Boston, aes(x = age, y = crim)) +
  geom_point() +
  labs(title = "Crime Rate and Old Building %")
ggplot(data = Boston, aes(x = dis, y = crim)) +
  geom_point() +
  labs(title = "Crime Rate and Proximity to City Centers")
# Title fixed: nox is nitrogen oxides concentration (was misspelled
# "Nitroous Oxide")
ggplot(data = Boston, aes(x = nox, y = crim)) +
  geom_point() +
  labs(title = "Crime Rate and Nitrogen Oxide Levels")
# Summary statistics (min/quartiles/mean/max) for every Boston column
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
# Number of census tracts bounding the Charles River (chas == 1)
chas_tracts <- Boston %>%
  filter(chas == 1) %>%
  count()
print(chas_tracts)
## n
## 1 35
# Tracts whose median home value equals the data-set minimum
low_medv <- Boston %>% filter(medv == min(medv))
print(low_medv)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 38.3518 0 18.1 0 0.693 5.453 100 1.4896 24 666 20.2 30.59 5
## 2 67.9208 0 18.1 0 0.693 5.683 100 1.4254 24 666 20.2 22.98 5
These two rows show an elevated crime rate, zero percent of land zoned for large (over 25,000 sq. ft.) lots, third-quartile industrial percentage, elevated NOx, near-median rm, the maximum percentage of buildings older than 1940, near-minimum distance to employment centers, high taxation, a high pupil-teacher ratio, and an above-third-quartile lower socio-economic status percentage.
# How many tracts average more than seven rooms per dwelling?
rm_gt7 <- Boston %>%
  filter(rm > 7) %>%
  count()
print(rm_gt7)
## n
## 1 64
# How many tracts average more than eight rooms per dwelling?
rm_gt8 <- Boston %>%
  filter(rm > 8) %>%
  count()
print(rm_gt8)
## n
## 1 13
# Inspect every tract averaging more than eight rooms per dwelling
plus8_rm <- Boston %>% filter(rm > 8)
print(plus8_rm)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.12083 0 2.89 0 0.4450 8.069 76.0 3.4952 2 276 18.0 4.21 38.7
## 2 1.51902 0 19.58 1 0.6050 8.375 93.9 2.1620 5 403 14.7 3.32 50.0
## 3 0.02009 95 2.68 0 0.4161 8.034 31.9 5.1180 4 224 14.7 2.88 50.0
## 4 0.31533 0 6.20 0 0.5040 8.266 78.3 2.8944 8 307 17.4 4.14 44.8
## 5 0.52693 0 6.20 0 0.5040 8.725 83.0 2.8944 8 307 17.4 4.63 50.0
## 6 0.38214 0 6.20 0 0.5040 8.040 86.5 3.2157 8 307 17.4 3.13 37.6
## 7 0.57529 0 6.20 0 0.5070 8.337 73.3 3.8384 8 307 17.4 2.47 41.7
## 8 0.33147 0 6.20 0 0.5070 8.247 70.4 3.6519 8 307 17.4 3.95 48.3
## 9 0.36894 22 5.86 0 0.4310 8.259 8.4 8.9067 7 330 19.1 3.54 42.8
## 10 0.61154 20 3.97 0 0.6470 8.704 86.9 1.8010 5 264 13.0 5.12 50.0
## 11 0.52014 20 3.97 0 0.6470 8.398 91.5 2.2885 5 264 13.0 5.91 48.8
## 12 0.57834 20 3.97 0 0.5750 8.297 67.0 2.4216 5 264 13.0 7.44 50.0
## 13 3.47428 0 18.10 1 0.7180 8.780 82.9 1.9047 24 666 20.2 5.29 21.9
15A This problem involves the Boston data set, which we saw in the lab for this chapter. We will now try to predict lstat using the other variables in this data set. In other words, lstat is the response, and the other variables are the predictors.
# for each of these variables create a model using lm polynomial function,
# use the apply function to automate this to each variable. Apply() has first argument
# as data, second argument "MARGIN" set here to '2' specifies to apply the function by column (can be 1 or 2 or both c1, 2). The last argument is a function. It will take each column and apply the function
# which in this case is a linear model lm(). lm() takes first argument model (here is y ~ x or response variable ~ predictor variable).
# Fit a separate simple regression lstat ~ x for every column of Boston and
# keep each model's summary. lapply() iterates over the data frame's columns
# directly; apply(Boston, 2, ...) would first coerce the data frame to a
# matrix, an anti-pattern (it loses column types). The lstat column itself is
# deliberately kept, which is what triggers the "essentially perfect fit"
# warning below for the $lstat entry.
lm_bost_df <- lapply(Boston, function(yy) summary(lm(lstat ~ yy, data = Boston)))
## Warning in summary.lm(lm(lstat ~ yy, data = Boston)): essentially perfect fit:
## summary may be unreliable
# Print every per-predictor regression summary
print(lm_bost_df)
## $crim
##
## Call:
## lm(formula = lstat ~ yy, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.732 -4.682 -1.037 3.618 22.508
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.28621 0.30687 36.78 <2e-16 ***
## yy 0.37826 0.03292 11.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.363 on 504 degrees of freedom
## Multiple R-squared: 0.2076, Adjusted R-squared: 0.206
## F-statistic: 132 on 1 and 504 DF, p-value: < 2.2e-16
##
##
## $zn
##
## Call:
## lm(formula = lstat ~ yy, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.360 -4.410 -0.755 3.198 23.880
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.09004 0.32199 43.76 <2e-16 ***
## yy -0.12645 0.01242 -10.18 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.51 on 504 degrees of freedom
## Multiple R-squared: 0.1706, Adjusted R-squared: 0.1689
## F-statistic: 103.6 on 1 and 504 DF, p-value: < 2.2e-16
##
##
## $indus
##
## Call:
## lm(formula = lstat ~ yy, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.2297 -3.4376 -0.7839 2.6677 20.9405
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.65353 0.48332 11.7 <2e-16 ***
## yy 0.62851 0.03696 17.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.698 on 504 degrees of freedom
## Multiple R-squared: 0.3646, Adjusted R-squared: 0.3633
## F-statistic: 289.2 on 1 and 504 DF, p-value: < 2.2e-16
##
##
## $chas
##
## Call:
## lm(formula = lstat ~ yy, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.028 -5.633 -1.330 4.300 25.212
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.7579 0.3289 38.791 <2e-16 ***
## yy -1.5162 1.2505 -1.212 0.226
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.138 on 504 degrees of freedom
## Multiple R-squared: 0.002908, Adjusted R-squared: 0.00093
## F-statistic: 1.47 on 1 and 504 DF, p-value: 0.2259
##
##
## $nox
##
## Call:
## lm(formula = lstat ~ yy, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.7808 -3.3041 -0.7693 3.2082 22.0421
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.545 1.255 -6.013 3.51e-09 ***
## yy 36.413 2.215 16.443 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.767 on 504 degrees of freedom
## Multiple R-squared: 0.3491, Adjusted R-squared: 0.3478
## F-statistic: 270.4 on 1 and 504 DF, p-value: < 2.2e-16
##
##
## $rm
##
## Call:
## lm(formula = lstat ~ yy, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.524 -4.206 -1.037 3.178 19.186
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.8594 2.2601 22.95 <2e-16 ***
## yy -6.2385 0.3574 -17.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.643 on 504 degrees of freedom
## Multiple R-squared: 0.3768, Adjusted R-squared: 0.3755
## F-statistic: 304.7 on 1 and 504 DF, p-value: < 2.2e-16
##
##
## $age
##
## Call:
## lm(formula = lstat ~ yy, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.2600 -3.3552 -0.1492 2.6266 25.8781
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.17435 0.66856 3.252 0.00122 **
## yy 0.15281 0.00902 16.940 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.706 on 504 degrees of freedom
## Multiple R-squared: 0.3628, Adjusted R-squared: 0.3615
## F-statistic: 287 on 1 and 504 DF, p-value: < 2.2e-16
##
##
## $dis
##
## Call:
## lm(formula = lstat ~ yy, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.0628 -4.1942 -0.6736 3.4781 21.6542
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.0494 0.5688 33.49 <2e-16 ***
## yy -1.6855 0.1311 -12.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.203 on 504 degrees of freedom
## Multiple R-squared: 0.247, Adjusted R-squared: 0.2455
## F-statistic: 165.3 on 1 and 504 DF, p-value: < 2.2e-16
##
##
## $rad
##
## Call:
## lm(formula = lstat ~ yy, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.485 -4.300 -1.080 3.246 23.981
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.82588 0.41171 21.44 <2e-16 ***
## yy 0.40078 0.03187 12.57 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.237 on 504 degrees of freedom
## Multiple R-squared: 0.2388, Adjusted R-squared: 0.2373
## F-statistic: 158.1 on 1 and 504 DF, p-value: < 2.2e-16
##
##
## $tax
##
## Call:
## lm(formula = lstat ~ yy, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.6344 -3.9245 -0.8784 2.6406 22.1961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.243415 0.699334 4.638 4.49e-06 ***
## yy 0.023049 0.001584 14.555 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.998 on 504 degrees of freedom
## Multiple R-squared: 0.2959, Adjusted R-squared: 0.2945
## F-statistic: 211.8 on 1 and 504 DF, p-value: < 2.2e-16
##
##
## $ptratio
##
## Call:
## lm(formula = lstat ~ yy, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.845 -4.699 -1.060 3.372 23.165
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.1171 2.5320 -3.996 7.41e-05 ***
## yy 1.2338 0.1363 9.055 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.629 on 504 degrees of freedom
## Multiple R-squared: 0.1399, Adjusted R-squared: 0.1382
## F-statistic: 81.98 on 1 and 504 DF, p-value: < 2.2e-16
##
##
## $lstat
##
## Call:
## lm(formula = lstat ~ yy, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.375e-14 -5.500e-17 5.500e-17 2.360e-16 1.586e-15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.054e-15 1.411e-16 -3.582e+01 <2e-16 ***
## yy 1.000e+00 9.714e-18 1.029e+17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.559e-15 on 504 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.06e+34 on 1 and 504 DF, p-value: < 2.2e-16
##
##
## $medv
##
## Call:
## lm(formula = lstat ~ yy, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.8631 -3.5959 -0.8133 2.4069 20.3152
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.55886 0.56823 44.98 <2e-16 ***
## yy -0.57276 0.02335 -24.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.826 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
^ All appear to have p values < 0.05 apart from the ‘chas’ variable or bordering the Charles River.
# Columns that were significant in the univariate fits: everything except the
# response (lstat) and the non-significant chas dummy.
sig_cols <- colnames(Boston)
sig_cols <- sig_cols[!sig_cols %in% c("lstat", "chas")]
Bost_abr <- Boston %>% dplyr::select(c(crim:indus, nox:medv))
# Plot lstat against each column. Iterate over the column *names* rather than
# the columns themselves: the previous apply() version passed the column's
# data vector to paste0(), so the titles were garbled vectors of values
# instead of the predictor's name.
lapply(names(Bost_abr), function(nm) {
  ggplot(data = Boston, aes(x = .data[[nm]], y = lstat)) +
    geom_point() +
    labs(title = paste0("lstat and ", nm), x = nm)
})
## $crim
##
## $zn
##
## $indus
##
## $nox
##
## $rm
##
## $age
##
## $dis
##
## $rad
##
## $tax
##
## $ptratio
##
## $lstat
##
## $medv
# this bit of code is still a work in progress, the lable is not coming through properly
#multiple linear regression achieved by using the shorthand 'lstat~.' meaning y ~ all predictors.
# Multiple linear regression: the '.' shorthand regresses lstat on every
# other column of Boston
lm_multi <- lm(lstat ~ ., data = Boston)
summary(lm_multi)
##
## Call:
## lm(formula = lstat ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.6587 -2.2562 -0.5174 1.9144 20.1630
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.751032 3.894208 9.181 < 2e-16 ***
## crim 0.048771 0.026606 1.833 0.06740 .
## zn 0.027994 0.011134 2.514 0.01224 *
## indus 0.086187 0.049448 1.743 0.08196 .
## chas 0.053266 0.701820 0.076 0.93953
## nox -1.596905 3.146058 -0.508 0.61197
## rm -2.243231 0.345799 -6.487 2.13e-10 ***
## age 0.072511 0.010126 7.161 2.92e-12 ***
## dis -0.388268 0.168700 -2.302 0.02178 *
## rad 0.152395 0.053969 2.824 0.00494 **
## tax -0.005144 0.003059 -1.682 0.09324 .
## ptratio -0.244001 0.110219 -2.214 0.02730 *
## medv -0.351623 0.032268 -10.897 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.829 on 493 degrees of freedom
## Multiple R-squared: 0.7193, Adjusted R-squared: 0.7124
## F-statistic: 105.3 on 12 and 493 DF, p-value: < 2.2e-16
crim, nox, and tax were significant in the univariate fits but now have p > .05 in the multiple regression.
Create a plot displaying the univariate regression coefficients from (a) on the x-axis, and the multiple regression coefficients from (b) on the y-axis. That is, each predictor is displayed as a single point in the plot. Its coefficient in a simple linear regression model is shown on the x-axis, and its coefficient estimate in the multiple linear regression model is shown on the y-axis.
# Pair each predictor's simple-regression slope with its multiple-regression
# coefficient and plot one point per predictor. Coefficients are matched by
# predictor *name*; the previous version dropped the intercept with a magic
# threshold (coef < 35), dropped the lstat self-fit with slope != 1.0000, and
# relied on bind_cols() lining rows up by coincidence of column order.
predictors <- setdiff(colnames(Boston), "lstat")
# Multiple-regression coefficients, intercept excluded by name
coef.lm_multi. <- round(coef(lm_multi)[predictors], digits = 4)
# Simple-regression slope of lstat on each predictor (element [2] = slope)
vv <- vapply(
  predictors,
  function(nm) unname(coef(lm(lstat ~ Boston[[nm]], data = Boston))[2]),
  numeric(1)
)
vv <- round(vv, digits = 4)
coef_df <- data.frame(predictor = predictors, vv = vv,
                      coef.lm_multi. = coef.lm_multi.)
ggplot(data = coef_df, aes(x = vv, y = coef.lm_multi.)) +
  geom_point() +
  labs(title = "simple vs multivariate linear regression coefficients",
       y = "multivariate coefficients", x = "simple coefficients")
# use form as specified above.
# NOTE(review): poly(rm + exp(1)) is a degree-1 (linear) polynomial of rm
# shifted by a constant, so this reduces to the simple linear fit of lstat
# on rm (the residuals below match the $rm fit earlier). Confirm whether a
# quadratic term, e.g. poly(rm, 2), was intended here.
summary(lm(lstat~poly(rm + exp(1)), data = Boston))
##
## Call:
## lm(formula = lstat ~ poly(rm + exp(1)), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.524 -4.206 -1.037 3.178 19.186
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.6531 0.2509 50.44 <2e-16 ***
## poly(rm + exp(1)) -98.5011 5.6431 -17.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.643 on 504 degrees of freedom
## Multiple R-squared: 0.3768, Adjusted R-squared: 0.3755
## F-statistic: 304.7 on 1 and 504 DF, p-value: < 2.2e-16
# Use the orthogonal polynomial function mentioned in chapter 7:
# degree-2 fit of lstat on rm (linear + quadratic terms)
summary(lm(lstat~poly(rm, 2), data = Boston))
##
## Call:
## lm(formula = lstat ~ poly(rm, 2), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.1269 -3.9962 -0.8329 3.1649 19.3836
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.6531 0.2492 50.769 < 2e-16 ***
## poly(rm, 2)1 -98.5011 5.6062 -17.570 < 2e-16 ***
## poly(rm, 2)2 15.5200 5.6062 2.768 0.00584 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.606 on 503 degrees of freedom
## Multiple R-squared: 0.3861, Adjusted R-squared: 0.3837
## F-statistic: 158.2 on 2 and 503 DF, p-value: < 2.2e-16
# chapter 3 lab uses the I function to do non-linear transformation
rm_lm_fit1 <- lm(lstat~rm, data = Boston)
rm_lm_fit2 <- lm(lstat~rm + I(rm^2), data = Boston)
# and anova comparison of the model without and with the non-linear component
anova(rm_lm_fit1, rm_lm_fit2)
## Analysis of Variance Table
##
## Model 1: lstat ~ rm
## Model 2: lstat ~ rm + I(rm^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 504 16050
## 2 503 15809 1 240.87 7.6639 0.005842 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Same comparison for age: does a quadratic term improve the fit?
age_lm_fit1 <- lm(lstat ~ age, data = Boston)
age_lm_fit2 <- lm(lstat ~ age + I(age^2), data = Boston)
anova(age_lm_fit1, age_lm_fit2)
## Analysis of Variance Table
##
## Model 1: lstat ~ age
## Model 2: lstat ~ age + I(age^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 504 16409
## 2 503 15401 1 1008 32.921 1.66e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Same comparison for medv: does a quadratic term improve the fit?
medv_lm_fit1 <- lm(lstat ~ medv, data = Boston)
medv_lm_fit2 <- lm(lstat ~ medv + I(medv^2), data = Boston)
anova(medv_lm_fit1, medv_lm_fit2)
## Analysis of Variance Table
##
## Model 1: lstat ~ medv
## Model 2: lstat ~ medv + I(medv^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 504 11739.3
## 2 503 8270.1 1 3469.3 211.01 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1